Dataset downloaded from mgandal’s github repository.
# Load csvs
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')
# 1. Group brain regions by lobes
# 2. Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers,
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
# 3. Transform Diagnosis into a factor variable
datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>%
mutate(Brain_lobe = ifelse(Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45'), 'Frontal',
ifelse(Brain_Region %in% c('BA3_1_2_5', 'BA7'), 'Parietal',
ifelse(Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22'), 'Temporal',
'Occipital')))) %>%
mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)),
Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>%
dplyr::select(-Diagnosis_)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label as neuronal the genes that make a match
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
rm(GO_annotations)
Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 63682 genes from 88 samples belonging to 41 different subjects."
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
## Cache found
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
datGenes$length = datGenes$end_position-datGenes$start_position
rm(getinfo, mart)
# 1. Filter genes with start or end position missing
to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
# 2. Filter genes that do not encode any protein
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
# 3. Filter genes with low expression levels
# 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
print(paste0('Filtered dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr),
' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Filtered dataset includes 19426 genes from 88 samples belonging to 41 different subjects."
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "967 SFARI genes remaining"
# Save datasets before level of expression filtering
datExpr_original = datExpr
datGenes_original = datGenes
datMeta_original = datMeta
Filtering outlier samples (there aren’t many)
Creating DESeq object and normalising using vst transformation
Threshold: Mean expression value
thresholds = c(0, 0.1, seq(0.2, 2, 0.2), 2.5, 3, 5, 7.5, 10)
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
cat(paste0('\n\nFiltering with threshold: ', threshold,'\n'))
to_keep = rowMeans(datExpr)>threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
cat(paste0('Removing ', sum(!to_keep), ' samples\n'))
rm(absadj, netsummary, ku, z.ku, to_keep)
# Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis)
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
} else {
new_entries = data.frame('threshold'=threshold, 'ID'=rownames(datExpr_vst),
'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
##
##
## Filtering with threshold: 0
## alpha: 1.000000
## Removing 7 samples
##
##
## Filtering with threshold: 0.1
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.4
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.6
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 0.8
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.4
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.6
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 1.8
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 2
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 2.5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 3
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 7.5
## alpha: 1.000000
## Removing 6 samples
##
##
## Filtering with threshold: 10
## alpha: 1.000000
## Removing 6 samples
# Plot Mean vs SD
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())